Hypothesis Tests on Audio Features
Two sample - independent tests
Independent here means that there is no reason to believe that
observations in the two samples can be paired in any way. (i.e. there’s
no reason to believe the danceability in the 60s is the same as the
2010s)
By randomly shuffling (i.e. permuting) the decade labels we lose any
relationship that there was between decade and danceability. Think of
this shuffling as detaching the labels from rows and then randomly
assigning them back to rows. Then we see which of the following
occurs:
If there was no relationship in the first place (i.e. they are in
fact independent) then randomly shuffling them should have no
implication. If the difference between groups in our sample is much
larger than the difference once the labels are shuffled it’s because
there is a real difference between the groups, and it’s not just down to
sampling variation.
This is an example of the hypothesis tests which were carried out on
each audio feature.
H0: The mean danceability in 1960s is the same as the mean
danceability in 2010s
Ha: The mean danceability in 1960s is less than the mean danceability
in 2010s
H0: The difference in means in 0 Ha: danceability2020 -
danceability1960 > 0
How do the audio features change over time
spotify_data_clean_pivot <- spotify_data_clean_join %>%
pivot_longer(cols = acousticness:valence,
names_to = "audio_feature",
values_to = "value") %>%
group_by(year, audio_feature) %>%
mutate(avg_feature = mean(value))
Plot with all Audio Features

Plot with audio features showing change

spotify_data_clean_join %>%
#mutate(explicit = as.numeric(explicit)) %>%
group_by(year) %>%
summarise(total_ex = sum(explicit)) %>%
ggplot() +
aes(x = year, y = total_ex / 20) +
geom_line(colour = "red", linewidth = 2) +
ylim(0, 50) +
labs(x = "Year",
y = "Proportion")+
# title = "Proportion of Explicit Songs",
# subtitle = "by year") +
theme_classic() +
theme(axis.text = element_text(face = "bold", size = 18, family = "Times"),
axis.title = element_text(face = "bold", size = 22, family = "Times"),
title = element_text(face = "bold", size = 18, family = "Times"),
legend.text = element_text(face = "bold", size = 14, family = "Times"),
legend.title = element_blank()
)
ggsave("plot_images/explicit.png", dpi = 720, width = 10, height = 6)

spotify_data_clean_join %>%
group_by(year) %>%
summarise(mean_pop = mean(popularity)) %>%
ggplot() +
aes(x = year, y = mean_pop) +
geom_line(colour = "purple2", linewidth = 2) +
labs(x = "Year",
y = "Popularity") +
ylim(0, 100) +
theme_bw() +
scale_colour_brewer(palette = "Dark2") +
theme(axis.text = element_text(face = "bold", size = 18, family = "Times"),
axis.title = element_text(face = "bold", size = 22, family = "Times"),
title = element_text(face = "bold", size = 18, family = "Times"),
legend.text = element_text(face = "bold", size = 14, family = "Times")
)
ggsave("plot_images/avg_pop.png", dpi = 720, width = 10, height = 6)

spot_sample %>%
ggplot() +
aes(x = danceability, y = popularity) +
geom_point(colour = "#1DB954", alpha = 0.8) +
geom_smooth(method = "lm", se = FALSE, colour = "purple3", linewidth = 2) +
labs(x = "Danceability",
y = "Popularity") +
theme_classic() +
theme(axis.text = element_text(face = "bold", size = 18, family = "Times"),
axis.title = element_text(face = "bold", size = 20, family = "Times"),
title = element_text(face = "bold", size = 18, family = "Times"),
legend.text = element_text(face = "bold", size = 14, family = "Times")
)

#ggsave("plot_images/pop_vs_dance.png", dpi = 720, width = 10, height = 6)
spot_sample %>%
ggplot() +
aes(x = loudness, y = popularity) +
geom_point(colour = "#1DB954", alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE, colour = "purple3", linewidth = 2) +
labs(x = "Loudness",
y = "Popularity") +
theme_classic() +
theme(axis.text = element_text(face = "bold", size = 18, family = "Times"),
axis.title = element_text(face = "bold", size = 22, family = "Times"),
title = element_text(face = "bold", size = 18, family = "Times"),
legend.text = element_text(face = "bold", size = 14, family = "Times")
)
ggsave("plot_images/pop_vs_loud.png", dpi = 720, width = 10, height = 6)

Intro To Linear Regression
To help me answer my question of what makes a song “popular” on
spotify, I decided to build an explanatory linear regression model.
This type of analysis is used to determine the strength of the
relationship between a response variable and multiple explanatory
variables.
So in this case I will be choosing from all of the variables I have
just discussed to explain the popularity of a song on spotify.
y = b0 + b1x1 + b2x2 + b3x3….bnxn
popularity = b0 + b1x1 + b2x2 + b3x3….bnxn
To start this process I plot popularity against each one of my
variables or possible explanatory variables and find the strongest
correlation.
n_data <- nrow(spotify_data_for_modelling)
sample_index <- sample(1:n_data, size = n_data*0.1)
spot_sample <- slice(spotify_data_for_modelling, sample_index)
spot_sample %>%
ggplot() +
aes(x = year, y = popularity) +
geom_point(alpha = 0.7, colour = "#1DB954") +
geom_smooth(method = "lm", se = FALSE, colour = "purple2") +
theme_classic() +
labs(x = "Year",
y = "Popularity") +
theme(axis.text = element_text(face = "bold", size = 18, family = "Times"),
axis.title = element_text(face = "bold", size = 18, family = "Times"),
title = element_text(face = "bold", size = 18, family = "Times"),
legend.text = element_text(face = "bold", size = 14, family = "Times")
)

#ggsave("plot_images/pop_vs_dance.png", dpi = 720, width = 10, height = 6)
spot_sample %>%
ggplot() +
aes(x = explicit, y = popularity) +
geom_point(alpha = 0.7, colour = "red4") +
geom_smooth(method = "lm", se = FALSE)

My strongest correlation was year (the year the track was released)
with a correlation of 0.74. I then add this to my model as my first
explanatory variable.
model_1a <- lm(popularity ~ year,
data = train_lm)
summary(model_1a)
Call:
lm(formula = popularity ~ year, data = train_lm)
Residuals:
Min 1Q Median 3Q Max
-19.600 -7.447 -1.773 5.668 54.400
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1220.936768 3.742482 -326.2 <0.0000000000000002 ***
year 0.634644 0.001881 337.4 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 10.08 on 95918 degrees of freedom
Multiple R-squared: 0.5427, Adjusted R-squared: 0.5427
F-statistic: 1.138e+05 on 1 and 95918 DF, p-value: < 0.00000000000000022
After running my model I’m looking at 3 factors: - The P-value - is
this variable making a significant difference. If the P-value is below
the significance level of 0.05 then we can reject the null hypothesis
and conclude that correlation between the variables is significant
- The R^2 - is a measure that indicates how much of the variation of
popularity is explained by the year
- The adjusted R^2 - compensates for the addition of variables. So as
we’re building an explanatory model, we don’t want this to drop much
lower than the r^2.
model_3a <- lm(popularity ~ year + danceability + loudness,
data = train_lm)
summary(model_3a)
model_3b <- lm(popularity ~ year + danceability + instrumentalness,
data = train_lm)
summary(model_3b)
anova(model_3a, model_2c)
Final Model
So here we have our final model. I stopped adding variables as the
Adjusted r^2 started dropping and our multiple r^2 was barely going
up.
So we can see that the more recent the release, the more
danceability, the louder it is, the less liveness it has. So you don’t
want a live recording, you want a studio recording with some explicit
lyrics chucked in there as well.
All of our P-Values are significant and we have a Multiple r^2 of
0.55, with an adjusted r^2 also of 0.55. This means that 55% of the
variance in popularity is explained our other variables. Which also
means that 45% of the variability in the data cannot be explained by
this model.
55% isn’t a very high proportion but it’s not terrible. We are trying
to measure the popularity of a song. It’s not quite as obvious as if we
were trying to explain the value of a house. We might find that the
postcode and the number of rooms goes a long way to explaining that
value. I think explaining the popularity of a song is bit more
complicated than that. If it was easy to know what made a song popular
then we’d all be musicians.
Mention human behaviour, mention the basic practice of statistics
would call this a moderate effect size.
So my model isn’t great. What can I do to improve this? Well…
When I was researching how spotify calculate their popularity score I
kept coming across this one phrase.
50 is the magic number
The higher your popularity index, the more likely the algorithm is to
recommend you to new listeners, and place you in algorithmic playlists
like Release Radar and Discover Weekly. Many websites and blogs had
theories on what number you had to hit to be added to certain editorial
playlists.
DIYMusician suggests that and popularity of 20+ in the first few
weeks of release will get you onto the Release Radar playlist and a
popularity score of 30+ will get you onto Discover Weekly. Loudlab
suggested that 50 is the magic number. I liked this idea of having a
threshold of whether or not a song is popular. So going back to my
variables I created a column called is_popular which is only TRUE if the
popularity of the song is 50 or over. This also lends itself nicely to a
logistic regression model.
Logistic regression is a statistical analysis method to predict, or
explain a binary outcome, such as yes or no, based on prior observations
of a data set.
So rather than using the popularity score I have used the variable I
created called is_popular, which splits the data in to a logical type,
so it’s TRUE if the song is 50 or above.
The way this is built is very similar to the linear model. I look for
correlations, and I add them to my model 1 at a time, check they are
significant. The main difference is that I’m looking for for a high AUC
score this time, rather than the multiple r^squared I was looking for in
the linear model.
I’ll explain the AUC in a second. I decided for this model to make a
couple of changes. I decided to no longer include the year or decade the
song was released. If I’m staying true to my original question then how
could I possibly write a song that was released in the past. It had such
a large influence on the linear model I thought it would be more
interesting to see how the logistic model fared without it. And then I
can test my model against different decades to see how it performs.
Oscar Wilde is famously credited with having written: “Popularity is
the one insult I have never suffered.”
spot_sample %>%
mutate(explicit = as.logical(explicit)) %>%
ggplot() +
aes(x = loudness, y = popularity, colour = explicit) +
geom_point(alpha = 0.8) +
geom_smooth(method = "lm", se = FALSE) +
labs(x = "Loudness",
y = "Popularity",
title = "Relationship between Popularity and Loundess",
subtitle = "Grouped by explicit or not") +
scale_colour_brewer(palette = "Dark2") +
theme_bw() +
theme(axis.text = element_text(face = "bold", size = 15, family = "Times"),
axis.title = element_text(face = "bold", size = 15, family = "Times"),
title = element_text(face = "bold", size = 15, family = "Times"),
legend.text = element_text(face = "bold", size = 10, family = "Times")
)

spot_sample %>%
mutate(explicit = as.logical(explicit)) %>%
ggplot() +
aes(x = loudness, y = as.integer(is_popular), colour = explicit) +
geom_jitter(shape = 1,
position = position_jitter(h = 0.05, w = 0.05),
alpha = 0.8) +
geom_line(data = train_model_1_loud, aes(x = loudness , y = pred), col = 'red') +
ylab("Probability") +
scale_colour_brewer(palette = "Dark2") +
theme_bw() +
theme(axis.text = element_text(face = "bold", size = 15, family = "Times"),
axis.title = element_text(face = "bold", size = 15, family = "Times"),
title = element_text(face = "bold", size = 15, family = "Times"),
legend.text = element_text(face = "bold", size = 10, family = "Times")
)

# ggplot(mortgage_data) +
# geom_jitter(aes(x = tu_score, y = as.integer(accepted)), shape = 1,
# position = position_jitter(h = 0.03)) +
# geom_line(data = predict_log, aes(x = tu_score , y = pred), col = 'red') +
# ylab("Probability")
Final Logistic Regression Model
model_4_final <- glm(is_popular ~ loudness + explicit + danceability + no_of_artists,
family = "binomial",
data = train_log_mod)
summary(model_4_final)
Call:
glm(formula = is_popular ~ loudness + explicit + danceability +
no_of_artists, family = "binomial", data = train_log_mod)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.3012 -0.8550 -0.6624 1.1355 3.7138
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -7.7547047 0.0993408 -78.06 <0.0000000000000002 ***
loudness 0.0780446 0.0012231 63.81 <0.0000000000000002 ***
explicit 1.0409331 0.0237662 43.80 <0.0000000000000002 ***
danceability 0.0095757 0.0004548 21.06 <0.0000000000000002 ***
no_of_artists 0.1844060 0.0114087 16.16 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 121164 on 97324 degrees of freedom
Residual deviance: 110272 on 97320 degrees of freedom
AIC: 110282
Number of Fisher Scoring iterations: 4
tidy(model_4_final)
train_model_4_final <- train_log_mod %>%
add_predictions(model_4_final, type = "response")
roc_obj_mod4 <- train_model_4_final %>%
roc(response = is_popular, predictor = pred)
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
roc_curve <- ggroc(
data = list(
best_model = roc_obj_mod4
),
legacy.axes = TRUE) +
coord_fixed() +
theme_classic()
auc(roc_obj_mod4)
Area under the curve: 0.7223
log_mod_1990 <- train_log_mod %>%
filter(decade == 1990 |
decade == 2000 |
decade == 2010)
model_90_00_10 <- glm(is_popular ~ loudness + explicit + danceability + no_of_artists,
family = "binomial",
data = log_mod_1990)
summary(model_90_00_10)
Call:
glm(formula = is_popular ~ loudness + explicit + danceability +
no_of_artists, family = "binomial", data = log_mod_1990)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.2496 -1.1659 0.8386 1.1431 2.3491
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.1595029 0.1137598 -27.773 < 0.0000000000000002 ***
loudness 0.0342887 0.0014178 24.185 < 0.0000000000000002 ***
explicit 0.4662174 0.0258787 18.016 < 0.0000000000000002 ***
danceability 0.0031530 0.0005672 5.559 0.0000000271 ***
no_of_artists 0.1691662 0.0143345 11.801 < 0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 66344 on 47879 degrees of freedom
Residual deviance: 64758 on 47875 degrees of freedom
AIC: 64768
Number of Fisher Scoring iterations: 4
tidy(model_90_00_10)
train_model_90_00_10 <- log_mod_1990 %>%
add_predictions(model_90_00_10, type = "response")
roc_obj_mod_90_00_10 <- train_model_90_00_10 %>%
roc(response = is_popular, predictor = pred)
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
roc_curve <- ggroc(
data = list(
best_model = roc_obj_mod4,
dec_60s_70s_80s = roc_obj_mod_60_70_80,
dec_90s_00s_10s = roc_obj_mod_90_00_10
),
legacy.axes = TRUE,
linewidth = 2) +
coord_fixed() +
labs(x = "1 - Specificity",
y = "Sensitivity") +
scale_colour_brewer(palette = "Dark2") +
scale_colour_discrete(labels = c("Final Model", "60s, 70s & 80s", "90s, 00s & 10s")) +
theme_classic() +
theme(axis.text = element_text(face = "bold", size = 18, family = "Times"),
axis.title = element_text(face = "bold", size = 18, family = "Times"),
title = element_text(face = "bold", size = 18, family = "Times"),
legend.text = element_text(face = "bold", size = 18, family = "Times"),
#legend.text = element_blank(),
legend.title = element_blank()
#legend.key = element_blank()
)
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
auc(roc_obj_mod_90_00_10)
Area under the curve: 0.629
roc_curve
ggsave("plot_images/log_models.png", dpi = 720, width = 12, height = 6)

log_mod_1990 <- train_log_mod %>%
filter(decade == 1990 |
decade == 2000 |
decade == 2010)
model_90_00_10 <- glm(is_popular ~ loudness + explicit + danceability + no_of_artists,
family = "binomial",
data = log_mod_1990)
summary(model_90_00_10)
Call:
glm(formula = is_popular ~ loudness + explicit + danceability +
no_of_artists, family = "binomial", data = log_mod_1990)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.2496 -1.1659 0.8386 1.1431 2.3491
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.1595029 0.1137598 -27.773 < 0.0000000000000002 ***
loudness 0.0342887 0.0014178 24.185 < 0.0000000000000002 ***
explicit 0.4662174 0.0258787 18.016 < 0.0000000000000002 ***
danceability 0.0031530 0.0005672 5.559 0.0000000271 ***
no_of_artists 0.1691662 0.0143345 11.801 < 0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 66344 on 47879 degrees of freedom
Residual deviance: 64758 on 47875 degrees of freedom
AIC: 64768
Number of Fisher Scoring iterations: 4
tidy(model_90_00_10)
train_model_90_00_10 <- log_mod_1990 %>%
add_predictions(model_90_00_10, type = "response")
roc_obj_mod_90_00_10 <- train_model_90_00_10 %>%
roc(response = is_popular, predictor = pred)
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
roc_curve <- ggroc(
data = list(
best_model = roc_obj_mod4
),
legacy.axes = TRUE,
linewidth = 2) +
coord_fixed() +
labs(x = "1 - Specificity",
y = "Sensitivity") +
scale_colour_brewer(palette = "Dark2") +
scale_colour_discrete(labels = "Final Model") +
theme_classic() +
theme(axis.text = element_text(face = "bold", size = 18, family = "Times"),
axis.title = element_text(face = "bold", size = 18, family = "Times"),
title = element_text(face = "bold", size = 18, family = "Times"),
legend.text = element_text(face = "bold", size = 16, family = "Times"),
legend.title = element_blank()
)
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
auc(roc_obj_mod_90_00_10)
Area under the curve: 0.629
roc_curve
ggsave("plot_images/log_best_model.png", dpi = 720, width = 12, height = 6)

---
title: "Spotify Analysis"
output: html_notebook
---

```{r include=FALSE}
library(spotifyr)
library(tidyverse)
library(modelr)
library(infer)
library(modelr)
library(ggfortify)
library(GGally)
```

```{r include=FALSE}
spotify_data_clean_join <- read_csv("clean_data/spotify_clean_join.csv")
spotify_data <- read_csv("raw_data/spotify_data.csv")
```


### Question

What (audio features) makes a song popular, and has that changed over time?

### Intro

I have been tasked by my client, Universal Records to investigate what makes a song popular on spotify in order to inform future releases. 

For this project I have used a combination of Spotify datasets available on Kaggle, taken from the Spotify Web API. The original dataset contains just under 170,000 rows with 19 columns. I have then joined this with another dataset which gave me information on "genres" and "followers". (and time signature) Each row in the dataset represents 1 song, information about that song and many of it's audio features.

I have used RStudio for this project as I found it excellent for model building.

I had no issues surrounding the ethics of using or presenting this data. I had to get authentication to use the Spotify web api but it is readily available for people to view and analyse.

### The Variables

Basic Info
artists <chr> - the artist or artists who appears on the track
track_name <chr> - name of the song
track_id <chr> - a distinct id to represent the song
year <date> - year the track was released
duration_sec <dbl> - duration of the track in seconds
key <fctr> - the key of the song  0-11
mode <fctr> - major or minor key
time_signature <fctr> - time signature of the song
tempo <dbl> - overall tempo of the track in BPM
explicit <fct> - does the track contain explicit content

decade <date> - decade of release
no_of_artists <dbl> - the number of artists appearing on each track
followers - 
genre - 
is_popular <lgl> - is the popularity rating 50 or over


All on a scale of 0 - 100
 - acousticness <dbl> - a measure of how acoustic the track is (was it recorded in a live setting or studio)

 - danceability <dbl> - how suitable is the track for dancing, based on rhythm stability, tempo and beat strength.

- energy - measures the intensity and activity level

- instrumentalness - measures how much vocals there are in a track. The higher the number, the less vocals in the track

- liveness - detects the presence of an audience in the recording. The higher the liveness the higher the probability it was recorded live.

- loudness - the overall loudness of a track. Originally in db, scaled to match the other audio features.

- speechiness - measures the presence of spoken word in a track. The closer to 100, the more exclusively speech-like the recording. Rap has higher score than folk.

- valence - the higher the value the more positive a track sounds(e.g. happy, cheerful). The lower the score the more negative it sounds (e.g. sad, angry)

### popularity -  Spotify -  "The popularity of a track is a value between 0 and 100, with 100 being the most popular. The popularity is calculated by algorithm and is based, in the most part, on the total number of plays the track has had and how recent those plays are."


## Hypothesis Tests on Audio Features

Two sample - independent tests

Independent here means that there is no reason to believe that observations in the two samples can be paired in any way. (i.e. there's no reason to believe the danceability in the 60s is the same as the 2010s)

By randomly shuffling (i.e. permuting) the decade labels we lose any relationship that there was between decade and danceability. Think of this shuffling as detaching the labels from rows and then randomly assigning them back to rows. Then we see which of the following occurs:

If there was no relationship in the first place (i.e. they are in fact independent) then randomly shuffling them should have no implication.
If the difference between groups in our sample is much larger than the difference once the labels are shuffled it’s because there is a real difference between the groups, and it’s not just down to sampling variation.

This is an example of the hypothesis tests which were carried out on each audio feature.

H0: The mean danceability in 1960s is the same as the mean danceability in 2010s

Ha: The mean danceability in 1960s is less than the mean danceability in 2010s

H0: The difference in means in 0
Ha: danceability2020 - danceability1960 > 0
```{r include=FALSE}
dance_decade_hyp <- spotify_data_clean_join %>% 
  select(decade, danceability) %>% 
  filter(decade == 1960 | decade == 2010) %>% 
  mutate(decade = as.factor(decade))

null_distribution <- dance_decade_hyp %>% 
  specify(danceability ~ decade) %>% 
  hypothesize(null = "independence") %>% 
  generate(reps = 5000, type = "permute") %>% 
  calculate(stat = "diff in means", order = c("2010", "1960")) 

observed_stat <- dance_decade_hyp %>% 
  specify(danceability ~ decade) %>%
  calculate(stat = "diff in means", order = c("2010", "1960"))

null_distribution %>%
  visualise() +
  shade_p_value(obs_stat = observed_stat, direction = "right")

p_value <- null_distribution %>%
  get_p_value(obs_stat = observed_stat, direction = "right")
```

### How do the audio features change over time

```{r}
spotify_data_clean_pivot <- spotify_data_clean_join %>% 
  pivot_longer(cols = acousticness:valence,
              names_to = "audio_feature",
              values_to = "value") %>% 
  group_by(year, audio_feature) %>% 
  mutate(avg_feature = mean(value))
```


Plot with all Audio Features
```{r}
audiot_features_all <- spotify_data_clean_pivot %>%
  ggplot() +
  aes(x = year, y = avg_feature, group = audio_feature, 
      colour = audio_feature) +
  geom_line(linewidth = 2) +
  ylim(0, 100) +
  labs(x = "Year",
       y = "Value",
       title = "Audio Features Over Time",
       subtitle = "Avg per year",
       colour = "Audio Feature") +
  theme_bw() +
  scale_colour_brewer(palette =  "Dark2") +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 18, family = "Times"),
        title = element_text(face = "bold", size = 15, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  ) 

#ggsave(audiot_features_all, "plot_images/audio_features_all.png")
```

Plot with audio features showing change
```{r}
audio_features_cut <- spotify_data_clean_pivot %>%
    filter(audio_feature == "acousticness" |
           audio_feature == "instrumentalness" |
           audio_feature == "danceability" |
           audio_feature == "speechiness" |
           audio_feature == "loudness") %>% 
  ggplot() +
  aes(x = year, y = avg_feature, group = audio_feature, 
      colour = audio_feature) +
  geom_line(linewidth = 2) +
  ylim(0, 100) +
  labs(x = "Year",
       y = "Value",
       # title = "Audio Features by Release Year",
       # subtitle = "Avg per year",
       colour = "Audio Feature") +
  theme_bw() +
  scale_colour_brewer(palette =  "Dark2") +
  scale_colour_discrete(labels = c("Acousticness", "Danceability", "Instrumentalness",
                                   "Loudness", "Speechiness")) +

  theme(axis.text  = element_text(face = "bold", size = 20, family = "Times"),
        axis.title  = element_text(face = "bold", size = 22, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 20, family = "Times"),
        legend.title = element_blank()
  ) 

ggsave("plot_images/audio_features_cut.png", dpi = 720, width = 12, height = 6)

?ggsave
```

```{r}
spotify_data_clean_join %>% 
  #mutate(explicit = as.numeric(explicit)) %>% 
  group_by(year) %>% 
  summarise(total_ex = sum(explicit)) %>% 
  ggplot() +
  aes(x = year, y = total_ex / 20) +
  geom_line(colour = "red", linewidth = 2) +
  ylim(0, 50) +
  labs(x = "Year",
       y = "Proportion")+
       # title = "Proportion of Explicit Songs",
       # subtitle = "by year") +
  theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 22, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  ) 

ggsave("plot_images/explicit.png", dpi = 720, width = 10, height = 6)
```
```{r}
spotify_data_clean_join %>% 
  group_by(year) %>% 
  summarise(mean_pop = mean(popularity)) %>% 
  ggplot() +
  aes(x = year, y = mean_pop) +
  geom_line(colour = "purple2", linewidth = 2) +
  labs(x = "Year",
       y = "Popularity") +
  ylim(0, 100) +
  theme_bw() +
  scale_colour_brewer(palette =  "Dark2") +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 22, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  ) 

ggsave("plot_images/avg_pop.png", dpi = 720, width = 10, height = 6)

```

```{r}
spot_sample %>% 
  ggplot() +
  aes(x = danceability, y = popularity) +
  geom_point(colour = "#1DB954", alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, colour = "purple3", linewidth = 2) +
  labs(x = "Danceability",
       y = "Popularity") +
       theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 20, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  )

#ggsave("plot_images/pop_vs_dance.png", dpi = 720, width = 10, height = 6)

```

```{r}
spot_sample %>% 
  ggplot() +
  aes(x = loudness, y = popularity) +
  geom_point(colour = "#1DB954", alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, colour = "purple3", linewidth = 2) +
  labs(x = "Loudness",
       y = "Popularity") +
       theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 22, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  )

ggsave("plot_images/pop_vs_loud.png", dpi = 720, width = 10, height = 6)

```

### Intro To Linear Regression

To help me answer my question of what makes a song "popular" on spotify, I decided to build an explanatory linear regression model. 

This type of analysis is used to determine the strength of the relationship between a response variable and multiple explanatory variables.

So in this case I will be choosing from all of the variables I have just discussed to explain the popularity of a song on spotify.

y = b0 + b1x1 + b2x2 + b3x3....bnxn

popularity = b0 + b1x1 + b2x2 + b3x3....bnxn

To start this process I plot popularity against each one of my variables or possible explanatory variables and find the strongest correlation.

```{r}
n_data <- nrow(spotify_data_for_modelling)

sample_index <- sample(1:n_data, size = n_data*0.1)

spot_sample <- slice(spotify_data_for_modelling, sample_index)

spot_sample %>% 
  ggplot() +
  aes(x = year, y = popularity) +
  geom_point(alpha = 0.7, colour = "#1DB954") +
  geom_smooth(method = "lm", se = FALSE, colour = "purple2") +
  theme_classic() +
  labs(x = "Year",
       y = "Popularity") +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 18, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  )

#ggsave("plot_images/pop_vs_dance.png", dpi = 720, width = 10, height = 6)
```

```{r}
spot_sample %>% 
  ggplot() +
  aes(x = explicit, y = popularity) +
  geom_point(alpha = 0.7, colour = "red4") +
  geom_smooth(method = "lm", se = FALSE)
```


My strongest correlation was year (the year the track was released) with a correlation of 0.74. I then add this to my model as my first explanatory variable.

```{r}
model_1a <- lm(popularity ~ year,
               data = train_lm)

summary(model_1a)
```

After running my model I'm looking at 3 factors:
 - The P-value - is this variable making a significant difference. If the P-value is below the significance level of 0.05 then we can reject the null hypothesis and conclude that correlation between the variables is significant
 
 - The R^2 - is a measure that indicates how much of the variation of popularity is explained by the year
 - The adjusted R^2 - compensates for the addition of variables. So as we're building an explanatory model, we don't want this to drop much lower than the r^2.

```{r}
model_3a <- lm(popularity ~ year + danceability + loudness,
               data = train_lm)

summary(model_3a)
```

```{r}
model_3b <- lm(popularity ~ year + danceability + instrumentalness,
               data = train_lm)

summary(model_3b)
```

```{r}
anova(model_3a, model_2c)
```

Final Model

```{r}
model_6a <- lm(popularity ~ year + danceability + loudness + liveness + explicit,
               data = train_lm)

summary(model_6a)
``` 

So here we have our final model. I stopped adding variables as the Adjusted r^2 started dropping and our multiple r^2 was barely going up.

So we can see that the more recent the release, the more danceability, the louder it is, the less liveness it has. So you don't want a live recording, you want a studio recording with some explicit lyrics chucked in there as well.

All of our P-Values are significant and we have a Multiple r^2 of 0.55, with an adjusted r^2 also of 0.55. This means that 55% of the variance in popularity is explained our other variables. Which  also means that 45% of the variability in the data cannot be explained by this model.

55% isn't a very high proportion but it's not terrible. We are trying to measure the popularity of a song. It's not quite as obvious as if we were trying to explain the value of a house. We might find that the postcode and the number of rooms goes a long way to explaining that value. I think explaining the popularity of a song is bit more complicated than that. If it was easy to know what made a song popular then we'd all be musicians.

Mention human behaviour, mention the basic practice of statistics would call this a moderate effect size. 


```{r}
model_6b <- lm(popularity ~ year + danceability + loudness + liveness + explicit,
               data = test_lm)

summary(model_6b)

``` 

So my model isn't great. What can I do to improve this? Well...

When I was researching how spotify calculate their popularity score I kept coming across this one phrase.

### 50 is the magic number

The higher your popularity index, the more likely the algorithm is to recommend you to new listeners, and place you in algorithmic playlists like Release Radar and Discover Weekly. Many websites and blogs had theories on what number you had to hit to be added to certain editorial playlists. 

DIYMusician suggests that and popularity of 20+ in the first few weeks of release will get you onto the Release Radar playlist and a popularity score of 30+ will get you onto Discover Weekly. Loudlab suggested that 50 is the magic number. I liked this idea of having a threshold of whether or not a song is popular. So going back to my variables I created a column called is_popular which is only TRUE if the popularity of the song is 50 or over. This also lends itself nicely to a logistic regression model. 

Logistic regression is a statistical analysis method to predict, or explain a binary outcome, such as yes or no, based on prior observations of a data set.

So rather than using the popularity score I have used the variable I created called is_popular, which splits the data in to a logical type, so it's TRUE if the song is 50 or above.

The way this is built is very similar to the linear model. I look for correlations, and I add them to my model 1 at a time, check they are significant. The main difference is that I'm looking for for a high AUC score this time, rather than the multiple r^squared I was looking for in the linear model.

I'll explain the AUC in a second. I decided for this model to make a couple of changes. I decided to no longer include the year or decade the song was released. If I'm staying true to my original question then how could I possibly write a song that was released in the past. It had such a large influence on the linear model I thought it would be more interesting to see how the logistic model fared without it. And then I can test my model against different decades to see how it performs.


Oscar Wilde is famously credited with having written: “Popularity is the one insult I have never suffered.”

```{r}
spot_sample %>% 
  mutate(explicit = as.logical(explicit)) %>% 
  ggplot() +
  aes(x = loudness, y = popularity, colour = explicit) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Loudness",
       y = "Popularity",
       title = "Relationship between Popularity and Loundess",
       subtitle = "Grouped by explicit or not") +
    scale_colour_brewer(palette =  "Dark2") +
  theme_bw() +
  theme(axis.text  = element_text(face = "bold", size = 15, family = "Times"),
        axis.title  = element_text(face = "bold", size = 15, family = "Times"),
        title = element_text(face = "bold", size = 15, family = "Times"),
        legend.text = element_text(face = "bold", size = 10, family = "Times")
  ) 
```


```{r}
spot_sample %>% 
  mutate(explicit = as.logical(explicit)) %>%
ggplot() +
  aes(x = loudness, y = as.integer(is_popular), colour = explicit) +
  geom_jitter(shape = 1,
              position = position_jitter(h = 0.05, w = 0.05),
              alpha = 0.8) +
   geom_line(data = train_model_1_loud, aes(x = loudness , y = pred), col = 'red') +
  ylab("Probability") +
      scale_colour_brewer(palette =  "Dark2") +
  theme_bw() +
  theme(axis.text  = element_text(face = "bold", size = 15, family = "Times"),
        axis.title  = element_text(face = "bold", size = 15, family = "Times"),
        title = element_text(face = "bold", size = 15, family = "Times"),
        legend.text = element_text(face = "bold", size = 10, family = "Times")
  ) 


# ggplot(mortgage_data) +
#   geom_jitter(aes(x = tu_score, y = as.integer(accepted)), shape = 1, 
#               position = position_jitter(h = 0.03)) + 
#    geom_line(data = predict_log, aes(x = tu_score , y = pred), col = 'red') + 
#   ylab("Probability")
```


Final Logistic Regression Model
```{r}
model_4_final <- glm(is_popular ~ loudness + explicit + danceability + no_of_artists,
             family = "binomial",
             data = train_log_mod)

summary(model_4_final)

tidy(model_4_final)

train_model_4_final <- train_log_mod %>%
  add_predictions(model_4_final, type = "response")

roc_obj_mod4 <- train_model_4_final %>%
  roc(response = is_popular, predictor = pred)

roc_curve <- ggroc(
  data = list(
    best_model = roc_obj_mod4
  ), 
  legacy.axes = TRUE) +
  coord_fixed() + 
  theme_classic()

auc(roc_obj_mod4)
```


```{r}
log_mod_1990 <- train_log_mod %>% 
  filter(decade == 1990 |
           decade == 2000 |
           decade == 2010)

model_90_00_10 <- glm(is_popular ~ loudness + explicit + danceability + no_of_artists,
                      family = "binomial",
                      data = log_mod_1990)

summary(model_90_00_10)

tidy(model_90_00_10)

train_model_90_00_10 <- log_mod_1990 %>%
  add_predictions(model_90_00_10, type = "response")

roc_obj_mod_90_00_10 <- train_model_90_00_10 %>%
  roc(response = is_popular, predictor = pred)


roc_curve <- ggroc(
  data = list(
    best_model = roc_obj_mod4,
    dec_60s_70s_80s = roc_obj_mod_60_70_80,
    dec_90s_00s_10s = roc_obj_mod_90_00_10
  ), 
  legacy.axes = TRUE,
  linewidth = 2) +
  coord_fixed() +
  labs(x = "1 - Specificity",
       y = "Sensitivity") +
  scale_colour_brewer(palette =  "Dark2") +
  scale_colour_discrete(labels = c("Final Model", "60s, 70s & 80s", "90s, 00s & 10s")) +
  theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 18, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 18, family = "Times"),
        #legend.text = element_blank(),
        legend.title = element_blank()
        #legend.key = element_blank()
  )

auc(roc_obj_mod_90_00_10)

roc_curve

ggsave("plot_images/log_models.png", dpi = 720, width = 12, height = 6)

```


```{r}

log_mod_1990 <- train_log_mod %>% 
  filter(decade == 1990 |
           decade == 2000 |
           decade == 2010)

model_90_00_10 <- glm(is_popular ~ loudness + explicit + danceability + no_of_artists,
                      family = "binomial",
                      data = log_mod_1990)

summary(model_90_00_10)

tidy(model_90_00_10)

train_model_90_00_10 <- log_mod_1990 %>%
  add_predictions(model_90_00_10, type = "response")

roc_obj_mod_90_00_10 <- train_model_90_00_10 %>%
  roc(response = is_popular, predictor = pred)


roc_curve <- ggroc(
  data = list(
    best_model = roc_obj_mod4
  ), 
  legacy.axes = TRUE,
  linewidth = 2) +
  coord_fixed() +
  labs(x = "1 - Specificity",
       y = "Sensitivity") +
  scale_colour_brewer(palette =  "Dark2") +
  scale_colour_discrete(labels = "Final Model") +
  theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 18, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 16, family = "Times"),
        legend.title = element_blank()
  )

auc(roc_obj_mod_90_00_10)

roc_curve

ggsave("plot_images/log_best_model.png", dpi = 720, width = 12, height = 6)

```




